# Polarisation Experiment: Data Analysis (experiment 1)
# Step 1: Clean Data
# Authors: Mark Kayser & Kasia Nalewajko
# First created: Oct 30, 2023
# Replicated: Nov 18, 2025

# Start from an empty global environment (removes every object in it).
rm(list = ls())

# LOAD PACKAGES -----------------------------------------------------------

# Install any package that is missing and then attach it. Attaching happens
# unconditionally: a bare `if (!require(x)) install.packages(x)` installs a
# missing package but never loads it, so a first run would fail further down.
# Packages are attached in the listed order.
required_packages <- c(
  "dplyr", "tidyr", "readxl", "stringr", "lubridate", "haven", "ggplot2"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Do not leave the loop helpers behind in the workspace.
rm(required_packages, pkg)

# LOAD DATA ----

# Placeholder: point this at the project root before running, because every
# path below is relative to it.
setwd("set/path/to/folder")

# Read the raw survey export, give the raw columns readable names, derive
# `age` and a running respondent `id`, and keep only the columns that are
# used later in this script.
results <- read_excel("01 data/01 raw data/study1/LRAF19278_Hancza_2023_2.xlsx") %>%
  dplyr::rename(
    # experiment components
    which_first = V66100010_1,
    exp_condition = P16_LOS,
    outcome = P17,
    manipulation_check1 = Conduct_manipulation_Check1,
    manipulation_check2 = Conduct_manipulation_check2,
    experiment_interpretation1 = P20_1,
    experiment_interpretation2 = P20_2,
    experiment_interpretation3 = P20_3,
    experiment_interpretation4 = P20_4,
    experiment_interpretation5 = P20_5,

    # party identification
    presidential_first_round = P4,
    presidential_second_round = P5,
    parliamentary = P7,
    party_id = P8,
    party_id_strength = P9,
    PiS_or_PO = P10,
    PiS_or_PO_strength = P11,
    political_ideology_GS2020 = P12, # from Graham & Svolik 2020 (henceforth: GS2020)
    party_id_PiS1_GS2020 = P13a_1, # GS2020 battery of 6 qs, PiS supporters version
    party_id_PiS2_GS2020 = P13a_2,
    party_id_PiS3_GS2020 = P13a_3,
    party_id_PiS4_GS2020 = P13a_4,
    party_id_PiS5_GS2020 = P13a_5,
    party_id_PiS6_GS2020 = P13a_6,
    party_id_PO1_GS2020 = P13b_1, # GS2020 battery of 6 qs, PO supporters version
    party_id_PO2_GS2020 = P13b_2,
    party_id_PO3_GS2020 = P13b_3,
    party_id_PO4_GS2020 = P13b_4,
    party_id_PO5_GS2020 = P13b_5,
    party_id_PO6_GS2020 = P13b_6,
    LR = V99,

    # controls
    # The raw `age` column is treated as a year of birth by the code below.
    yob = age,
    gender = sex,
    locality_size = Qxcla,
    voivodeship = Qxvoi,
    education = Edu,
    survey_date = STIME
  ) %>%
  mutate(
    # 2023 is hard-coded as the reference year for computing age.
    age = 2023 - yob,
    # Running row number; row_number() also copes with a zero-row sheet,
    # where seq.int(nrow(.)) would produce c(1, 0).
    id = row_number()
  ) %>%
  dplyr::select(
    id,
    which_first,
    exp_condition,
    outcome,
    manipulation_check1,
    manipulation_check2,
    experiment_interpretation1,
    experiment_interpretation2,
    experiment_interpretation3,
    experiment_interpretation4,
    experiment_interpretation5,
    LR,
    presidential_first_round,
    presidential_second_round,
    parliamentary,
    party_id,
    party_id_strength,
    PiS_or_PO,
    PiS_or_PO_strength,
    political_ideology_GS2020,
    party_id_PiS1_GS2020,
    party_id_PiS2_GS2020,
    party_id_PiS3_GS2020,
    party_id_PiS4_GS2020,
    party_id_PiS5_GS2020,
    party_id_PiS6_GS2020,
    party_id_PO1_GS2020,
    party_id_PO2_GS2020,
    party_id_PO3_GS2020,
    party_id_PO4_GS2020,
    party_id_PO5_GS2020,
    party_id_PO6_GS2020,
    yob,
    age,
    gender,
    locality_size,
    voivodeship,
    education,
    survey_date
  )

# INSPECT DATA ----

# check patterns of missingness
# Number of missing values per column (one row, one column per variable).
missing_summary <- results %>%
  summarise(across(everything(), function(column) sum(is.na(column))))

# Number of respondents in each experimental condition.
results %>%
  count(exp_condition)

# CLEAN DATA ----

# ADJUST LEVELS SO THAT THEY MAKE SUBSTANTIVE SENSE ----
# Replace the condition codes 1/2/3 with labels; any other value is kept as
# its character representation. The result is a factor whose reference
# (first) level is "control".
results <- results %>%
  mutate(
    exp_condition = case_when(
      exp_condition == 1 ~ "PiSwins",
      exp_condition == 2 ~ "POwins",
      exp_condition == 3 ~ "control",
      TRUE ~ as.character(exp_condition)
    ),
    exp_condition = relevel(as.factor(exp_condition), ref = "control")
  )

# RECODE LEFT-RIGHT MEASURE ----

# Left-right placement: values above 10 are set to missing.
# NOTE(review): presumably codes above 10 are non-substantive answers on a
# 0-10 scale -- confirm against the codebook.
results$LR <- replace(results$LR, which(results$LR > 10), NA)

# RECODE WHICH_FIRST MEASURE ----

# Shift the indicator down by one and store it under the name
# `experiment_first` (the column keeps its position in the data).
results <- results %>%
  mutate(which_first = which_first - 1) %>%
  rename(experiment_first = which_first)

# CREATE A FACTOR "PO/PIS" VARIABLE WITH RELATED STRENGTH OF PO/PIS SUPPORT (coded 1-5) ----

# what party do you identify with
# `popis` classifies each respondent as a "PiS" or "PO" supporter. It is
# filled in stages: each stage only touches respondents whose `popis` is still
# missing, so earlier questions take precedence over later ones. The counts in
# the trailing comments are the author's figures for this data set.

# what party do you identify with
results$popis <- ifelse(results$party_id == 4, "PiS", NA)
results$popis <- ifelse(results$party_id == 16, "PO", results$popis)

results %>%
  count(popis) # 616 respondents identified

# whom are you planning on voting for in parliamentary elections in 2023
results$popis <- ifelse(results$parliamentary == 4 & is.na(results$popis), "PiS", results$popis)
results$popis <- ifelse(results$parliamentary == 6 & is.na(results$popis), "PO", results$popis)

results %>%
  count(popis) # 120 additional respondents identified

# whom did you vote for in the first round of 2020 presidential elections
results$popis <- ifelse(results$presidential_first_round == 1 & is.na(results$popis), "PiS", results$popis)
results$popis <- ifelse(results$presidential_first_round == 2 & is.na(results$popis), "PO", results$popis)

results %>%
  count(popis) # 189 additional respondents identified

# whom did you vote for in the second round of 2020 presidential elections
results$popis <- ifelse(results$presidential_second_round == 1 & is.na(results$popis), "PiS", results$popis)
results$popis <- ifelse(results$presidential_second_round == 2 & is.na(results$popis), "PO", results$popis)

results %>%
  count(popis) # 137 additional respondents identified

# Flags for respondents who are still unclassified at this point. They must be
# computed BEFORE the forced-choice answers are written into `popis` below.
results$forced_POPiS_choice <- ifelse(results$PiS_or_PO < 3 & is.na(results$popis), 1, 0) # 56 people were "forced to choose"
results$never_chooses_POPiS <- ifelse(results$PiS_or_PO == 999 & is.na(results$popis), 1, 0) # 80 people didn't agree to choose

# if you had to choose...
results$popis <- ifelse(results$PiS_or_PO == 1 & is.na(results$popis), "PiS", results$popis)
results$popis <- ifelse(results$PiS_or_PO == 2 & is.na(results$popis), "PO", results$popis)

# The survey company Kantar made a mistake. They didn't ask the "If you had to
# choose..." question to people who checked in "3" or "15" in the ID question
# (instead of "4" and "16"). I am assuming that Kukiz supporters ("3") prefer
# PiS to PO, which allows me to identify fortunately only further 2 people.
# Number "15" is "Partia Kierowców" which couldn't be so easily translated to
# either PO or PiS... Fortunately, there are only 3 respondents who identify
# with it, so I drop them altogether.
#
# `%in%` is used instead of `==` so that a missing `party_id` leaves `popis`
# untouched; with `==` the comparison is NA and ifelse() would overwrite an
# already assigned value with NA.
# NOTE(review): this line sets "PiS" for every respondent with party_id 3,
# including any who were classified in an earlier stage -- confirm that this
# override is intended.
# NOTE(review): no code here removes respondents with party_id 15; they simply
# receive no value from this step -- confirm this matches "drop them".
results$popis <- ifelse(results$party_id %in% 3, "PiS", results$popis)

results %>%
  count(popis) # 58 additional respondents identified

results$popis <- as.factor(results$popis)

sum(results$forced_POPiS_choice, na.rm = TRUE)
sum(results$never_chooses_POPiS, na.rm = TRUE)



# CALCULATE STRENGTH OF PARTISANSHIP (Graham and Svolik) -------

# Ideology item: values above 7 are set to missing.
results$conservative <- results$political_ideology_GS2020
results$conservative <- ifelse(results$conservative > 7, NA, results$conservative)

results %>%
  count(conservative)

# Merge the two versions of the six-item battery into GSparty_id1..6:
# take the PiS-version answer and fall back to the PO-version answer
# wherever the PiS-version answer is missing.
for (item in seq_len(6)) {
  pis_answer <- results[[paste0("party_id_PiS", item, "_GS2020")]]
  po_answer <- results[[paste0("party_id_PO", item, "_GS2020")]]
  results[[paste0("GSparty_id", item)]] <- ifelse(is.na(pis_answer), po_answer, pis_answer)
}
rm(item, pis_answer, po_answer)

# Overall score: items 2-6 summed (missing items are skipped) minus item 1.
# The score is NA whenever item 1 is missing. The row sum is computed with a
# vectorised rowSums() so that `results` is not turned into a row-wise grouped
# data frame at this step.
results <- results %>%
  mutate(
    GSparty_id = rowSums(
      cbind(GSparty_id2, GSparty_id3, GSparty_id4, GSparty_id5, GSparty_id6),
      na.rm = TRUE
    ) - GSparty_id1
  )

results %>%
  count(GSparty_id)

sum(is.na(results$GSparty_id))

# Distribution of the overall score.
ggplot(results, aes(x = GSparty_id)) +
  geom_bar()

# CREATE NEGATIVE PARTISANSHIP BASED ON Graham and Svolik (Q4 + Q6) ------

# inspect
# inspect (prints both item vectors)
results$GSparty_id4
results$GSparty_id6 # the higher the score, the more hateful of the rival they are

# Negative partisanship = item 4 + item 6 (a missing item is skipped), set to
# NA wherever the overall GSparty_id score is NA.
# ungroup() removes any row-wise grouping left on `results`, and the sum is
# computed with a vectorised rowSums(), so the data frame carried forward (and
# eventually saved) is a plain, ungrouped one.
results <- results %>%
  ungroup() %>%
  mutate(
    GSnegative_partisan = rowSums(cbind(GSparty_id4, GSparty_id6), na.rm = TRUE),
    GSnegative_partisan = ifelse(is.na(GSparty_id), NA, GSnegative_partisan)
  )

# STANDARDISE THE "STRENGTH OF PARTISANSHIP" MEASURES -------

# Standardise `x`: subtract its mean and divide by its standard deviation.
# Arguments in `...` (e.g. na.rm = TRUE) are forwarded to both mean() and sd().
zscore <- function(x, ...) {
  centre <- mean(x, ...)
  spread <- sd(x, ...)
  (x - centre) / spread
}

# Add a "<name>_z" column for each measure, standardised over the full sample
# with missing values ignored when computing the mean and standard deviation.
for (measure in c("conservative", "GSparty_id", "GSnegative_partisan")) {
  results[[paste0(measure, "_z")]] <- zscore(results[[measure]], na.rm = TRUE)
}
rm(measure)

# CORRECT VARIABLE CLASSES FOR REGRESSIONS -------

# Region enters as a factor; the remaining controls are coerced to numeric.
results[["voivodeship"]] <- as.factor(results[["voivodeship"]])
for (numeric_control in c("yob", "locality_size", "education")) {
  results[[numeric_control]] <- as.numeric(results[[numeric_control]])
}
rm(numeric_control)

# RECODE GENDER VARIABLE --------------

# `female` is the gender code shifted down by one.
# NOTE(review): presumably gender is coded 1 = male, 2 = female, which makes
# this a 0/1 dummy -- confirm against the codebook.
results[["female"]] <- results[["gender"]] - 1

# CREATE TWO SAMPLES "PO supporters" only AND "PiS supporters" only AND SAVE SEPARATELY ------

# Subsamples restricted to respondents classified as PO or PiS supporters.
# NOTE(review): the section header says these are saved separately, but the
# visible code only writes the full sample -- confirm whether they are saved
# elsewhere.
results_POsupporters <- dplyr::filter(results, popis == "PO")
results_PiSsupporters <- dplyr::filter(results, popis == "PiS")

# SAVE CLEAN DATA -------

# Write the full cleaned sample (path is relative to the working directory).
save(
  results,
  file = "./01 data/02 generated data/study1/01results_fullsample_clean.Rda"
)




